Next, we can look at retrieving the entire full text of a collection. Some of our collections are very large and slow to analyze, so with this tool I suggest using one of our smaller full-text collections, such as the 'darwin' collection used below.
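If you'd like to browse what is available first, the API also exposes a collection listing; the sketch below is an assumption on my part, namely that a top-level /collections endpoint returns collection identifiers under 'data' in the same shape as the other responses in this notebook:
In [ ]:
import requests
# List available collections (assumed endpoint; the response shape is
# presumed to mirror the other API calls in this notebook)
collectionsUrl = 'https://oc-index.library.ubc.ca/collections'
apiResponse = requests.get(collectionsUrl).json()
print(apiResponse['data'])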
In [ ]:
import json, requests, math, re, string, nltk
nltk.download("punkt") # Word tokenizer
nltk.download("stopwords") # Stop words
from nltk import word_tokenize
In [ ]:
# Collection to get data from
collection = 'darwin'
# Query the API for the collection information
collectionUrl = 'https://oc-index.library.ubc.ca/collections/'+collection
apiResponse = requests.get(collectionUrl).json()
# Get count of items in collection
itemCount = apiResponse['data']['items']
# Get collection name
collectionTitle = apiResponse['data']['title']
'Collection: "' + collectionTitle + '" has ' + str(itemCount) + ' items'
In [ ]:
perPage = 25
offset = 0
pages = math.ceil(itemCount / perPage)
# Loop through the collection's item pages to get all item ids
itemIds = []
for x in range(0, pages):
    collectionItemsUrl = 'https://oc-index.library.ubc.ca/collections/'+collection+'/items?limit='+str(perPage)+'&offset='+str(offset)
    offset += perPage
    # Get one page of up to perPage items
    apiResponse = requests.get(collectionItemsUrl).json()
    collectionItems = apiResponse['data']
    # Add each item id to the itemIds list
    for collectionItem in collectionItems:
        itemIds.append(collectionItem['_id'])
print(itemIds)
In [ ]:
items = []
fullTexts = []
for itemId in itemIds:
    itemUrl = 'https://oc-index.library.ubc.ca/collections/'+collection+'/items/'+itemId
    apiResponse = requests.get(itemUrl).json()
    item = apiResponse['data']
    items.append(item)
    if 'FullText' in item:
        fullText = item['FullText'][0]['value']
        # Lower-case the full text
        cleanFullText = fullText.lower()
        # Replace everything that isn't a word character with a space
        pattern = re.compile(r'[\W_]+')
        cleanFullText = pattern.sub(' ', cleanFullText)
        # Add to the full texts list
        fullTexts.append(cleanFullText)
    else:
        # No full text available for this item; keep the list aligned
        fullTexts.append('')
'Done'
In [ ]:
characterLength = 0
for fullText in fullTexts:
    characterLength += len(fullText)
characterLength
In [ ]:
tokens = []
for fullText in fullTexts:
    tokens += word_tokenize(fullText)
len(tokens)
In [ ]:
len(set(tokens))
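Dividing the total token count by the number of distinct tokens gives the average number of times each word is used, the inverse of the collection's lexical diversity: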
In [ ]:
len(tokens)/len(set(tokens))
In [ ]:
search = "will"
In [ ]:
text = nltk.Text(tokens)
text.count(search)
In [ ]:
# Percentage of all tokens that are the search term
100.0 * text.count(search) / len(text)
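A concordance shows every occurrence of the search term together with its surrounding context: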
In [ ]:
text.concordance(search)
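We can also ask for other words that appear in contexts similar to the search term's: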
In [ ]:
text.similar(search)
In [ ]:
v = set(text)
long_words = [word for word in v if len(word) > 15]
sorted(long_words)
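Collocations are sequences of words that occur together unusually often in the text: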
In [ ]:
text.collocations()
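A dispersion plot shows where the search term occurs across the combined text, measured in word offsets from the beginning: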
In [ ]:
import numpy
# Allow plots to display inline in this interface
%matplotlib inline
text.dispersion_plot([search])
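A frequency distribution counts how often each token occurs; most_common(50) lists the fifty most frequent tokens: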
In [ ]:
from nltk import FreqDist
fdist = FreqDist(text)
fdist.most_common(50)
In [ ]:
fdist.plot(25)
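The top of the frequency distribution is dominated by stop words ('the', 'of', 'and', and so on). We downloaded NLTK's English stop word list at the start of this notebook, so we can filter those out and recount; here is a minimal sketch (contentTokens and contentFdist are illustrative names):
In [ ]:
from nltk.corpus import stopwords
# Keep only tokens that are not English stop words
stopWords = set(stopwords.words('english'))
contentTokens = [token for token in tokens if token not in stopWords]
contentFdist = FreqDist(contentTokens)
contentFdist.most_common(50)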